{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.6.9 64-bit",
   "metadata": {
    "interpreter": {
     "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "source": [
    "# 基于多属性注意力机制的匹配模型"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "# Import modules 导入模块\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import csv\n",
    "import py_entitymatching as em\n",
    "import os"
   ],
   "cell_type": "code",
   "metadata": {},
   "execution_count": 1,
   "outputs": []
  },
  {
   "source": [
    "## 预处理"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "### 读取数据"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "filepath1 = \"datasets/DBLP-Scholar/DBLP1.csv\"\n",
    "filepath2 = \"datasets/DBLP-Scholar/Scholar.csv\"\n",
    "filepath3 = \"datasets/DBLP-Scholar/DBLP-Scholar_perfectMapping.csv\""
   ]
  },
  {
   "source": [
    "**注意文件编码格式**:其中`DBLP-Scholar/Scholar.csv`文件的编码尝试为`iso8859-1`,可能是其中的`authors`一栏中有作者的名字为法语,编码不是常见的`utf-8`\n",
    "\n",
    "文件的读取直接使用`pandan`的`read_csv`函数读出,然后使用`.head()`查看部分数据"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "dblp = pd.read_csv(filepath1, encoding=\"iso8859-1\")\n",
    "scholar = pd.read_csv(filepath2)\n",
    "dblp_scholar_map = pd.read_csv(filepath3)"
   ]
  },
  {
   "source": [
    "### 查看数据格式"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "                             id  \\\n",
       "0  conf/vldb/RusinkiewiczKTWM95   \n",
       "1  journals/sigmod/EisenbergM02   \n",
       "2          conf/vldb/AmmannJR95   \n",
       "3         journals/sigmod/Liu02   \n",
       "4      journals/sigmod/Hammer02   \n",
       "\n",
       "                                                                                       title  \\\n",
       "0                   Towards a Cooperative Transaction Model - The Cooperative Activity Model   \n",
       "1                                                            SQL/XML is Making Good Progress   \n",
       "2        Using Formal Methods to Reason about Semantics-Based Decompositions of Transactions   \n",
       "3                                                                             Editor's Notes   \n",
       "4  Report on the ACM Fourth International Workshop on Data Warehousing and OLAP (DOLAP 2001)   \n",
       "\n",
       "                                              authors          venue  year  \n",
       "0  M Rusinkiewicz, W Klas, T Tesch, J Wï¿½sch, P Muth           VLDB  1995  \n",
       "1                               A Eisenberg, J Melton  SIGMOD Record  2002  \n",
       "2                          P Ammann, S Jajodia, I Ray           VLDB  1995  \n",
       "3                                               L Liu  SIGMOD Record  2002  \n",
       "4                                                 NaN            NaN  2002  "
      ],
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>title</th>\n      <th>authors</th>\n      <th>venue</th>\n      <th>year</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>conf/vldb/RusinkiewiczKTWM95</td>\n      <td>Towards a Cooperative Transaction Model - The Cooperative Activity Model</td>\n      <td>M Rusinkiewicz, W Klas, T Tesch, J Wï¿½sch, P Muth</td>\n      <td>VLDB</td>\n      <td>1995</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>journals/sigmod/EisenbergM02</td>\n      <td>SQL/XML is Making Good Progress</td>\n      <td>A Eisenberg, J Melton</td>\n      <td>SIGMOD Record</td>\n      <td>2002</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>conf/vldb/AmmannJR95</td>\n      <td>Using Formal Methods to Reason about Semantics-Based Decompositions of Transactions</td>\n      <td>P Ammann, S Jajodia, I Ray</td>\n      <td>VLDB</td>\n      <td>1995</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>journals/sigmod/Liu02</td>\n      <td>Editor's Notes</td>\n      <td>L Liu</td>\n      <td>SIGMOD Record</td>\n      <td>2002</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>journals/sigmod/Hammer02</td>\n      <td>Report on the ACM Fourth International Workshop on Data Warehousing and OLAP (DOLAP 2001)</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>2002</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {},
     "execution_count": 4
    }
   ],
   "source": [
    "dblp.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "             id  \\\n",
       "0  aKcZKwvwbQwJ   \n",
       "1  ixKfiTHoaDoJ   \n",
       "2  3BxllB4wwcIJ   \n",
       "3  d2WWxwKMex4J   \n",
       "4  cZCX-AQpjccJ   \n",
       "\n",
       "                                                                                               title  \\\n",
       "0                                                                         11578 Sorrento Valley Road   \n",
       "1                                                                Initiation of crazes in polystyrene   \n",
       "2  Immunogold labelling is a quantitative method as demonstrated by studies on aminopeptidase N in     \n",
       "3          The Burden of Infectious Disease Among Inmates and Releasees From Correctional Facilities   \n",
       "4                                            The Role of Faculty Advising in Science and Engineering   \n",
       "\n",
       "                                            authors  \\\n",
       "0                                            QD Inc   \n",
       "1                             AS Argon, JG Hannoosh   \n",
       "2  GH Hansen, LL Wetterberg, H SjÃ¶strÃ¶m, O NorÃ©n   \n",
       "3                    TM Hammett, P Harmon, W Rhodes   \n",
       "4                                        JR Cogdell   \n",
       "\n",
       "                                       venue    year  \n",
       "0                                 San Diego,     NaN  \n",
       "1                                 Phil. Mag,     NaN  \n",
       "2                 The Histochemical Journal,  1992.0  \n",
       "3                                        see     NaN  \n",
       "4  NEW DIRECTIONS FOR TEACHING AND LEARNING,  1995.0  "
      ],
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>title</th>\n      <th>authors</th>\n      <th>venue</th>\n      <th>year</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>aKcZKwvwbQwJ</td>\n      <td>11578 Sorrento Valley Road</td>\n      <td>QD Inc</td>\n      <td>San Diego,</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>ixKfiTHoaDoJ</td>\n      <td>Initiation of crazes in polystyrene</td>\n      <td>AS Argon, JG Hannoosh</td>\n      <td>Phil. Mag,</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>3BxllB4wwcIJ</td>\n      <td>Immunogold labelling is a quantitative method as demonstrated by studies on aminopeptidase N in</td>\n      <td>GH Hansen, LL Wetterberg, H SjÃ¶strÃ¶m, O NorÃ©n</td>\n      <td>The Histochemical Journal,</td>\n      <td>1992.0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>d2WWxwKMex4J</td>\n      <td>The Burden of Infectious Disease Among Inmates and Releasees From Correctional Facilities</td>\n      <td>TM Hammett, P Harmon, W Rhodes</td>\n      <td>see</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>cZCX-AQpjccJ</td>\n      <td>The Role of Faculty Advising in Science and Engineering</td>\n      <td>JR Cogdell</td>\n      <td>NEW DIRECTIONS FOR TEACHING AND LEARNING,</td>\n      <td>1995.0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {},
     "execution_count": 5
    }
   ],
   "source": [
    "scholar.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "                               idDBLP     idScholar\n",
       "0                conf/sigmod/AbadiC02  f2Lea-RN8dsJ\n",
       "1  conf/sigmod/AbadiCCCCEGHMRSSTXYZ03  eBnT7lhV2LwJ\n",
       "2  conf/sigmod/AbadiCCCCEGHMRSSTXYZ03  gBVNSFeS4P8J\n",
       "3  conf/sigmod/AbadiCCCCEGHMRSSTXYZ03  VuY9Y49GqXgJ\n",
       "4         conf/sigmod/AbiteboulBCMM03  AxpQwgyRyLgJ"
      ],
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>idDBLP</th>\n      <th>idScholar</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>conf/sigmod/AbadiC02</td>\n      <td>f2Lea-RN8dsJ</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>conf/sigmod/AbadiCCCCEGHMRSSTXYZ03</td>\n      <td>eBnT7lhV2LwJ</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>conf/sigmod/AbadiCCCCEGHMRSSTXYZ03</td>\n      <td>gBVNSFeS4P8J</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>conf/sigmod/AbadiCCCCEGHMRSSTXYZ03</td>\n      <td>VuY9Y49GqXgJ</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>conf/sigmod/AbiteboulBCMM03</td>\n      <td>AxpQwgyRyLgJ</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {},
     "execution_count": 6
    }
   ],
   "source": [
    "dblp_scholar_map.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# set metadata/设置 metadata\n",
    "em.set_key(dblp, 'id')\n",
    "em.set_key(scholar, 'id')\n",
    "\n",
    "# set title and artists to lower case/将'title'和'artists'转换为小写\n",
    "dblp[\"title\"] = dblp[\"title\"].str.lower()\n",
    "scholar[\"title\"] = scholar[\"title\"].str.lower()\n",
    "\n",
    "dblp[\"authors\"] = dblp[\"authors\"].str.lower()\n",
    "scholar[\"authors\"] = scholar[\"authors\"].str.lower()\n",
    "\n",
    "# preprocessing -- set all year to be interger/预处理,将年份变为整数\n",
    "def short_year(x):\n",
    "    \"\"\" \n",
    "    if x is nan, then `x == x` will be False\n",
    "    如果`x`是nan类型的,那么x==x这个比较会得到False, Interesting!\n",
    "    \"\"\"\n",
    "    if x == x:\n",
    "        return int(x)\n",
    "    return 0\n",
    "\n",
    "scholar[\"year\"] = scholar[\"year\"].apply(short_year)\n",
    "dblp[\"year\"] = dblp[\"year\"].apply(short_year)"
   ]
  },
  {
   "source": [
    "check the two set after preprocessing/在预处理之后检查两个数据集"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "                             id  \\\n",
       "0  conf/vldb/RusinkiewiczKTWM95   \n",
       "1  journals/sigmod/EisenbergM02   \n",
       "2          conf/vldb/AmmannJR95   \n",
       "3         journals/sigmod/Liu02   \n",
       "4      journals/sigmod/Hammer02   \n",
       "\n",
       "                                                                                       title  \\\n",
       "0                   towards a cooperative transaction model - the cooperative activity model   \n",
       "1                                                            sql/xml is making good progress   \n",
       "2        using formal methods to reason about semantics-based decompositions of transactions   \n",
       "3                                                                             editor's notes   \n",
       "4  report on the acm fourth international workshop on data warehousing and olap (dolap 2001)   \n",
       "\n",
       "                                              authors          venue  year  \n",
       "0  m rusinkiewicz, w klas, t tesch, j wï¿½sch, p muth           VLDB  1995  \n",
       "1                               a eisenberg, j melton  SIGMOD Record  2002  \n",
       "2                          p ammann, s jajodia, i ray           VLDB  1995  \n",
       "3                                               l liu  SIGMOD Record  2002  \n",
       "4                                                 NaN            NaN  2002  "
      ],
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>title</th>\n      <th>authors</th>\n      <th>venue</th>\n      <th>year</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>conf/vldb/RusinkiewiczKTWM95</td>\n      <td>towards a cooperative transaction model - the cooperative activity model</td>\n      <td>m rusinkiewicz, w klas, t tesch, j wï¿½sch, p muth</td>\n      <td>VLDB</td>\n      <td>1995</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>journals/sigmod/EisenbergM02</td>\n      <td>sql/xml is making good progress</td>\n      <td>a eisenberg, j melton</td>\n      <td>SIGMOD Record</td>\n      <td>2002</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>conf/vldb/AmmannJR95</td>\n      <td>using formal methods to reason about semantics-based decompositions of transactions</td>\n      <td>p ammann, s jajodia, i ray</td>\n      <td>VLDB</td>\n      <td>1995</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>journals/sigmod/Liu02</td>\n      <td>editor's notes</td>\n      <td>l liu</td>\n      <td>SIGMOD Record</td>\n      <td>2002</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>journals/sigmod/Hammer02</td>\n      <td>report on the acm fourth international workshop on data warehousing and olap (dolap 2001)</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>2002</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {},
     "execution_count": 8
    }
   ],
   "source": [
    "dblp.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "             id  \\\n",
       "0  aKcZKwvwbQwJ   \n",
       "1  ixKfiTHoaDoJ   \n",
       "2  3BxllB4wwcIJ   \n",
       "3  d2WWxwKMex4J   \n",
       "4  cZCX-AQpjccJ   \n",
       "\n",
       "                                                                                               title  \\\n",
       "0                                                                         11578 sorrento valley road   \n",
       "1                                                                initiation of crazes in polystyrene   \n",
       "2  immunogold labelling is a quantitative method as demonstrated by studies on aminopeptidase n in     \n",
       "3          the burden of infectious disease among inmates and releasees from correctional facilities   \n",
       "4                                            the role of faculty advising in science and engineering   \n",
       "\n",
       "                                            authors  \\\n",
       "0                                            qd inc   \n",
       "1                             as argon, jg hannoosh   \n",
       "2  gh hansen, ll wetterberg, h sjã¶strã¶m, o norã©n   \n",
       "3                    tm hammett, p harmon, w rhodes   \n",
       "4                                        jr cogdell   \n",
       "\n",
       "                                       venue  year  \n",
       "0                                 San Diego,     0  \n",
       "1                                 Phil. Mag,     0  \n",
       "2                 The Histochemical Journal,  1992  \n",
       "3                                        see     0  \n",
       "4  NEW DIRECTIONS FOR TEACHING AND LEARNING,  1995  "
      ],
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>title</th>\n      <th>authors</th>\n      <th>venue</th>\n      <th>year</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>aKcZKwvwbQwJ</td>\n      <td>11578 sorrento valley road</td>\n      <td>qd inc</td>\n      <td>San Diego,</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>ixKfiTHoaDoJ</td>\n      <td>initiation of crazes in polystyrene</td>\n      <td>as argon, jg hannoosh</td>\n      <td>Phil. Mag,</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>3BxllB4wwcIJ</td>\n      <td>immunogold labelling is a quantitative method as demonstrated by studies on aminopeptidase n in</td>\n      <td>gh hansen, ll wetterberg, h sjã¶strã¶m, o norã©n</td>\n      <td>The Histochemical Journal,</td>\n      <td>1992</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>d2WWxwKMex4J</td>\n      <td>the burden of infectious disease among inmates and releasees from correctional facilities</td>\n      <td>tm hammett, p harmon, w rhodes</td>\n      <td>see</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>cZCX-AQpjccJ</td>\n      <td>the role of faculty advising in science and engineering</td>\n      <td>jr cogdell</td>\n      <td>NEW DIRECTIONS FOR TEACHING AND LEARNING,</td>\n      <td>1995</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {},
     "execution_count": 9
    }
   ],
   "source": [
    "scholar.head()"
   ]
  },
  {
   "source": [
    "## 寻找一个候选集(Blocking)"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "### 1. block with title/用title栏组成混合集"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "ab1 = em.AttrEquivalenceBlocker()\n",
    "C1 = ab1.block_tables(dblp, scholar, \n",
    "                      l_block_attr='title', r_block_attr='title',\n",
    "                      l_output_attrs=['title', 'authors', 'year'],\n",
    "                      r_output_attrs=['title', 'authors', 'year'])\n",
    "print(len(C1))\n",
    "C1.head()"
   ],
   "cell_type": "code",
   "metadata": {},
   "execution_count": 10,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "3324\n"
     ]
    },
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "   _id                     ltable_id  \\\n",
       "0    0  journals/sigmod/EisenbergM02   \n",
       "1    1          conf/vldb/AmmannJR95   \n",
       "2    2         journals/sigmod/Liu02   \n",
       "3    3         journals/sigmod/Liu02   \n",
       "4    4         journals/sigmod/Liu02   \n",
       "\n",
       "                                      rtable_id  \\\n",
       "0                                  wgK6p4mDSIMJ   \n",
       "1                                  x-H7BqZ0Hw8J   \n",
       "2                                  ntqMqfgRXM4J   \n",
       "3  url:http://www.roc.noaa.gov/news/vol1is4.pdf   \n",
       "4                                  TUOBVMb4PBsJ   \n",
       "\n",
       "                                                                          ltable_title  \\\n",
       "0                                                      sql/xml is making good progress   \n",
       "1  using formal methods to reason about semantics-based decompositions of transactions   \n",
       "2                                                                       editor's notes   \n",
       "3                                                                       editor's notes   \n",
       "4                                                                       editor's notes   \n",
       "\n",
       "               ltable_authors  ltable_year  \\\n",
       "0       a eisenberg, j melton         2002   \n",
       "1  p ammann, s jajodia, i ray         1995   \n",
       "2                       l liu         2002   \n",
       "3                       l liu         2002   \n",
       "4                       l liu         2002   \n",
       "\n",
       "                                                                          rtable_title  \\\n",
       "0                                                      sql/xml is making good progress   \n",
       "1  using formal methods to reason about semantics-based decompositions of transactions   \n",
       "2                                                                       editor's notes   \n",
       "3                                                                       editor's notes   \n",
       "4                                                                       editor's notes   \n",
       "\n",
       "               rtable_authors  rtable_year  \n",
       "0       a eisenberg, j melton         2002  \n",
       "1  p ammann, s jajodia, i ray         1995  \n",
       "2                 r goldstein         1996  \n",
       "3                    nl smith         1981  \n",
       "4                   dw leslie            0  "
      ],
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>_id</th>\n      <th>ltable_id</th>\n      <th>rtable_id</th>\n      <th>ltable_title</th>\n      <th>ltable_authors</th>\n      <th>ltable_year</th>\n      <th>rtable_title</th>\n      <th>rtable_authors</th>\n      <th>rtable_year</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>journals/sigmod/EisenbergM02</td>\n      <td>wgK6p4mDSIMJ</td>\n      <td>sql/xml is making good progress</td>\n      <td>a eisenberg, j melton</td>\n      <td>2002</td>\n      <td>sql/xml is making good progress</td>\n      <td>a eisenberg, j melton</td>\n      <td>2002</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1</td>\n      <td>conf/vldb/AmmannJR95</td>\n      <td>x-H7BqZ0Hw8J</td>\n      <td>using formal methods to reason about semantics-based decompositions of transactions</td>\n      <td>p ammann, s jajodia, i ray</td>\n      <td>1995</td>\n      <td>using formal methods to reason about semantics-based decompositions of transactions</td>\n      <td>p ammann, s jajodia, i ray</td>\n      <td>1995</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2</td>\n      <td>journals/sigmod/Liu02</td>\n      <td>ntqMqfgRXM4J</td>\n      <td>editor's notes</td>\n      <td>l liu</td>\n      <td>2002</td>\n      <td>editor's notes</td>\n      <td>r goldstein</td>\n      <td>1996</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>3</td>\n      <td>journals/sigmod/Liu02</td>\n      <td>url:http://www.roc.noaa.gov/news/vol1is4.pdf</td>\n      <td>editor's notes</td>\n      <td>l liu</td>\n      <td>2002</td>\n      <td>editor's notes</td>\n      <td>nl 
smith</td>\n      <td>1981</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>4</td>\n      <td>journals/sigmod/Liu02</td>\n      <td>TUOBVMb4PBsJ</td>\n      <td>editor's notes</td>\n      <td>l liu</td>\n      <td>2002</td>\n      <td>editor's notes</td>\n      <td>dw leslie</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {},
     "execution_count": 10
    }
   ]
  },
  {
   "source": [
    "### 2. block with author/用author栏组成混合集 "
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "3554\n"
     ]
    },
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "   _id                     ltable_id     rtable_id  \\\n",
       "0    0  journals/sigmod/EisenbergM02  e3s4OFTeBqwJ   \n",
       "1    1  journals/sigmod/EisenbergM02  -RAYAJbKLNUJ   \n",
       "2    2  journals/sigmod/EisenbergM02  6uelfg3RgEoJ   \n",
       "3    3  journals/sigmod/EisenbergM02  4mD1eFCHKKwJ   \n",
       "4    4  journals/sigmod/EisenbergM02  Z1P9QnSfDuAJ   \n",
       "\n",
       "                      ltable_title         ltable_authors  ltable_year  \\\n",
       "0  sql/xml is making good progress  a eisenberg, j melton         2002   \n",
       "1  sql/xml is making good progress  a eisenberg, j melton         2002   \n",
       "2  sql/xml is making good progress  a eisenberg, j melton         2002   \n",
       "3  sql/xml is making good progress  a eisenberg, j melton         2002   \n",
       "4  sql/xml is making good progress  a eisenberg, j melton         2002   \n",
       "\n",
       "                                                       rtable_title  \\\n",
       "0  sqlj part 1: sql routines using the java tm programming language   \n",
       "1                                             standards in practice   \n",
       "2                                           an early look at xquery   \n",
       "3                               sql standardization: the next steps   \n",
       "4                                 sql: 1999, formerly known as sql3   \n",
       "\n",
       "          rtable_authors  rtable_year  \n",
       "0  a eisenberg, j melton         1999  \n",
       "1  a eisenberg, j melton         1998  \n",
       "2  a eisenberg, j melton         2002  \n",
       "3  a eisenberg, j melton         2000  \n",
       "4  a eisenberg, j melton         1999  "
      ],
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>_id</th>\n      <th>ltable_id</th>\n      <th>rtable_id</th>\n      <th>ltable_title</th>\n      <th>ltable_authors</th>\n      <th>ltable_year</th>\n      <th>rtable_title</th>\n      <th>rtable_authors</th>\n      <th>rtable_year</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>journals/sigmod/EisenbergM02</td>\n      <td>e3s4OFTeBqwJ</td>\n      <td>sql/xml is making good progress</td>\n      <td>a eisenberg, j melton</td>\n      <td>2002</td>\n      <td>sqlj part 1: sql routines using the java tm programming language</td>\n      <td>a eisenberg, j melton</td>\n      <td>1999</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1</td>\n      <td>journals/sigmod/EisenbergM02</td>\n      <td>-RAYAJbKLNUJ</td>\n      <td>sql/xml is making good progress</td>\n      <td>a eisenberg, j melton</td>\n      <td>2002</td>\n      <td>standards in practice</td>\n      <td>a eisenberg, j melton</td>\n      <td>1998</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2</td>\n      <td>journals/sigmod/EisenbergM02</td>\n      <td>6uelfg3RgEoJ</td>\n      <td>sql/xml is making good progress</td>\n      <td>a eisenberg, j melton</td>\n      <td>2002</td>\n      <td>an early look at xquery</td>\n      <td>a eisenberg, j melton</td>\n      <td>2002</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>3</td>\n      <td>journals/sigmod/EisenbergM02</td>\n      <td>4mD1eFCHKKwJ</td>\n      <td>sql/xml is making good progress</td>\n      <td>a eisenberg, j melton</td>\n      <td>2002</td>\n      <td>sql standardization: the next steps</td>\n      
<td>a eisenberg, j melton</td>\n      <td>2000</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>4</td>\n      <td>journals/sigmod/EisenbergM02</td>\n      <td>Z1P9QnSfDuAJ</td>\n      <td>sql/xml is making good progress</td>\n      <td>a eisenberg, j melton</td>\n      <td>2002</td>\n      <td>sql: 1999, formerly known as sql3</td>\n      <td>a eisenberg, j melton</td>\n      <td>1999</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {},
     "execution_count": 11
    }
   ],
   "source": [
    "ab2 = em.AttrEquivalenceBlocker()\n",
    "C2 = ab2.block_tables(dblp, scholar,\n",
    "                      l_block_attr='authors', r_block_attr='authors',\n",
    "                      l_output_attrs=['title', 'authors', 'year'],\n",
    "                      r_output_attrs=['title', 'authors', 'year'])\n",
    "print(len(C2))\n",
    "C2.head()"
   ]
  },
  {
   "source": [
    "### 3. Overlap(重叠)\n",
    "3.1 overlap at least 6 words of title/标题至少用六个词重复"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "0% [##############################] 100% | ETA: 00:00:005634\n",
      "\n",
      "Total time elapsed: 00:00:07\n"
     ]
    }
   ],
   "source": [
    "ob3 = em.OverlapBlocker()\n",
    "C3 = ob3.block_tables(dblp, scholar, 'title', 'title',\n",
    "                      word_level=True, overlap_size=6,\n",
    "                      l_output_attrs=['title', 'authors', 'year'],\n",
    "                      r_output_attrs=['title', 'authors', 'year'],\n",
    "                      show_progress=True)\n",
    "print(len(C3))"
   ]
  },
  {
   "source": [
    "3.2 same year/年份相同"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "0% [##############################] 100% | ETA: 00:00:005634\n",
      "\n",
      "Total time elapsed: 00:00:00\n"
     ]
    }
   ],
   "source": [
    "a3 = em.AttrEquivalenceBlocker()\n",
    "c3 = a3.block_candset(C3, 'year', 'year', show_progress=True)\n",
    "print(len(C3))"
   ]
  },
  {
   "source": [
    "### 4. Overlap(重叠)\n",
    "4.1 overlap at least 5 words of authors/作者栏至少有五个单次重合"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "0% [##############################] 100% | ETA: 00:00:005958\n",
      "\n",
      "Total time elapsed: 00:00:03\n"
     ]
    }
   ],
   "source": [
    "ob4 = em.OverlapBlocker()\n",
    "C4 = ob4.block_tables(dblp, scholar, 'authors', 'authors',\n",
    "                      word_level=True, overlap_size=5,\n",
    "                      l_output_attrs=['title', 'authors', 'year'],\n",
    "                      r_output_attrs=['title', 'authors', 'year'],\n",
    "                      show_progress=True)\n",
    "print(len(C4))"
   ]
  },
  {
   "source": [
    "4.2 same year/年份相同"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "0% [##############################] 100% | ETA: 00:00:001145\n",
      "\n",
      "Total time elapsed: 00:00:00\n"
     ]
    }
   ],
   "source": [
    "a4 = em.AttrEquivalenceBlocker()\n",
    "C4 = a4.block_candset(C4, 'year', 'year',show_progress=True)\n",
    "print(len(C4))"
   ]
  },
  {
   "source": [
    "### Union results together to get final candiate set(合并之前得到的候选集合得到最终的候选集合))"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "9558\n"
     ]
    },
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "   _id         ltable_id     rtable_id  \\\n",
       "0    0  conf/sigmod/2000  19VIHSiMAXcJ   \n",
       "1    1  conf/sigmod/2000  2DtY9exkFcgJ   \n",
       "2    2  conf/sigmod/2000  5k-GwvznWRUJ   \n",
       "3    3  conf/sigmod/2000  5wcgt7bNx7YJ   \n",
       "4    4  conf/sigmod/2000  AYtgczYwVnYJ   \n",
       "\n",
       "                                                                                          ltable_title  \\\n",
       "0  proceedings of the 2000 acm sigmod international conference on management of data, may 16-18, 20...   \n",
       "1  proceedings of the 2000 acm sigmod international conference on management of data, may 16-18, 20...   \n",
       "2  proceedings of the 2000 acm sigmod international conference on management of data, may 16-18, 20...   \n",
       "3  proceedings of the 2000 acm sigmod international conference on management of data, may 16-18, 20...   \n",
       "4  proceedings of the 2000 acm sigmod international conference on management of data, may 16-18, 20...   \n",
       "\n",
       "  ltable_authors  ltable_year  \\\n",
       "0            NaN         2000   \n",
       "1            NaN         2000   \n",
       "2            NaN         2000   \n",
       "3            NaN         2000   \n",
       "4            NaN         2000   \n",
       "\n",
       "                                                                                          rtable_title  \\\n",
       "0   carnot and infosleuth-database technology and the www. acm sigmod intern. conf. on management of     \n",
       "1     umform techniques for loop optimization, in proceedings of the acm international conference on     \n",
       "2  discover: keyword search in relational databases in proceedings of the international conference ...   \n",
       "3        shoring up persistent applications, acm sigmod international conference on management of data   \n",
       "4              concept based design of data warehouses: the dwq demonstrators. in 2000 acm sigmod intl   \n",
       "\n",
       "                                   rtable_authors  rtable_year  \n",
       "0                                n jacobs, r shea            0  \n",
       "1                                          w pugh         1991  \n",
       "2                 v hristidis, y papakonstantinou            0  \n",
       "3             mj carey, dj dewitt, mj franklinâ?¦            0  \n",
       "4  m jarke, c quix, d calvanese, m lenzerini, e              0  "
      ],
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>_id</th>\n      <th>ltable_id</th>\n      <th>rtable_id</th>\n      <th>ltable_title</th>\n      <th>ltable_authors</th>\n      <th>ltable_year</th>\n      <th>rtable_title</th>\n      <th>rtable_authors</th>\n      <th>rtable_year</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>conf/sigmod/2000</td>\n      <td>19VIHSiMAXcJ</td>\n      <td>proceedings of the 2000 acm sigmod international conference on management of data, may 16-18, 20...</td>\n      <td>NaN</td>\n      <td>2000</td>\n      <td>carnot and infosleuth-database technology and the www. acm sigmod intern. conf. 
on management of</td>\n      <td>n jacobs, r shea</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1</td>\n      <td>conf/sigmod/2000</td>\n      <td>2DtY9exkFcgJ</td>\n      <td>proceedings of the 2000 acm sigmod international conference on management of data, may 16-18, 20...</td>\n      <td>NaN</td>\n      <td>2000</td>\n      <td>umform techniques for loop optimization, in proceedings of the acm international conference on</td>\n      <td>w pugh</td>\n      <td>1991</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2</td>\n      <td>conf/sigmod/2000</td>\n      <td>5k-GwvznWRUJ</td>\n      <td>proceedings of the 2000 acm sigmod international conference on management of data, may 16-18, 20...</td>\n      <td>NaN</td>\n      <td>2000</td>\n      <td>discover: keyword search in relational databases in proceedings of the international conference ...</td>\n      <td>v hristidis, y papakonstantinou</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>3</td>\n      <td>conf/sigmod/2000</td>\n      <td>5wcgt7bNx7YJ</td>\n      <td>proceedings of the 2000 acm sigmod international conference on management of data, may 16-18, 20...</td>\n      <td>NaN</td>\n      <td>2000</td>\n      <td>shoring up persistent applications, acm sigmod international conference on management of data</td>\n      <td>mj carey, dj dewitt, mj franklinâ?¦</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>4</td>\n      <td>conf/sigmod/2000</td>\n      <td>AYtgczYwVnYJ</td>\n      <td>proceedings of the 2000 acm sigmod international conference on management of data, may 16-18, 20...</td>\n      <td>NaN</td>\n      <td>2000</td>\n      <td>concept based design of data warehouses: the dwq demonstrators. in 2000 acm sigmod intl</td>\n      <td>m jarke, c quix, d calvanese, m lenzerini, e</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {},
     "execution_count": 16
    }
   ],
   "source": [
    "G = em.combine_blocker_outputs_via_union([C1, C2, C3, C4])\n",
    "print(len(G))\n",
    "\n",
    "G.head()"
   ]
  },
  {
   "source": [
    "### Run debugger to make sure not missing any valid matches(测试是否遗漏匹配)"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "e for a web-site management system   \n",
       "15                                          streaming queries over streaming data   \n",
       "16                                          strudel: a web-site management system   \n",
       "17                                              schema mapping as query discovery   \n",
       "18                                      retrieval of composite multimedia objects   \n",
       "19                                                tree pattern query minimization   \n",
       "20                               learning to match ontologies on the semantic web   \n",
       "21                                          strudel: a web-site management system   \n",
       "22                                   probabilistic temporal databases, i: algebra   \n",
       "23                   irisnet: an architecture for internet-scale sensing services   \n",
       "24                                  the multidimensional database system rasdaman   \n",
       "25                                                      sequence query processing   \n",
       "26                                     building knowledge base management systems   \n",
       "27                     declarative data cleaning: language, model, and algorithms   \n",
       "28                                           a fast index for semistructured data   \n",
       "29                      active storage for large-scale data mining and multimedia   \n",
       "30                                          domains, relations and religious wars   \n",
       "31                                       disseminating updates on broadcast disks   \n",
       "32                      tutorial: application servers and associated technologies   \n",
       "33                                                  dissemination of dynamic data   \n",
       "34                                   probabilistic temporal databases, i: algebra   \n",
       "35                                          semantic data caching and replacement   \n",
       "36                                                   data warehouse configuration   \n",
       "37                                              object fusion in mediator systems   \n",
       "38                a conceptual architecture for semantic web enabled web services   \n",
       "39                                          computing iceberg queries efficiently   \n",
       "40                     irisnet: internet-scale resource-intensive sensor services   \n",
       "41                                  efficient view maintenance at data warehouses   \n",
       "42                                        the mlpq/gis constraint database system   \n",
       "43                                             a framework for semantic gossiping   \n",
       "44                       mindreader: querying databases through multiple examples   \n",
       "45                                 cache-and-query for wide area sensor databases   \n",
       "46                                          agora: living with xml and relational   \n",
       "47                                                            the third manifesto   \n",
       "48                                  an asymptotically optimal multiversion b-tree   \n",
       "49                                             the oracle universal server buffer   \n",
       "\n",
       "                                                             ltable_authors  \\\n",
       "0                 j clifford, c dyreson, t isakowitz, c jensen, r snodgrass   \n",
       "1   j hammer, h garcia-molina, s nestorov, r yerneni, m breunig, v vassalos   \n",
       "2                                     h jagadish, n koudas, s muthukrishnan   \n",
       "3                           r sawai, m tsukamoto, y loh, t terada, s nishio   \n",
       "4                                     d agrawal, a abbadi, a singh, t yurek   \n",
       "5                                           j madhavan, p bernstein, e rahm   \n",
       "6                          m fernandez, d florescu, j kang, a levy, d suciu   \n",
       "7                              s acharya, p gibbons, v poosala, s ramaswamy   \n",
       "8                                     l haas, d kossmann, e wimmers, j yang   \n",
       "9                                  s madden, m shah, j hellerstein, v raman   \n",
       "10                                                   y ioannidis, v poosala   \n",
       "11                                                      d lomet, b salzberg   \n",
       "12                                  m stillger, g lohman, v markl, m kandil   \n",
       "13                             s acharya, p gibbons, v poosala, s ramaswamy   \n",
       "14                                 m fernandez, d florescu, a levy, d suciu   \n",
       "15                                             s chandrasekaran, m franklin   \n",
       "16                         m fernandez, d florescu, j kang, a levy, d suciu   \n",
       "17                                          r miller, l haas, m hernï¿½ndez   \n",
       "18                                s chaudhuri, s ghandeharizadeh, c shahabi   \n",
       "19                          s amer-yahia, s cho, l lakshmanan, d srivastava   \n",
       "20                    a doan, j madhavan, r dhamankar, p domingos, a halevy   \n",
       "21                         m fernandez, d florescu, j kang, a levy, d suciu   \n",
       "22                                       a dekhtyar, r ross, v subrahmanian   \n",
       "23                   s nath, a deshpande, y ke, p gibbons, b karp, s seshan   \n",
       "24                      p baumann, a dehmel, p furtado, r ritsch, n widmann   \n",
       "25                                      p seshadri, m livny, r ramakrishnan   \n",
       "26          j mylopoulos, v chaudhri, d plexousakis, a shrufi, t topaloglou   \n",
       "27                      h galhardas, d florescu, d shasha, e simon, c saita   \n",
       "28                   b cooper, n sample, m franklin, g hjaltason, m shadmon   \n",
       "29                                          e riedel, g gibson, c faloutsos   \n",
       "30                                                                  r camps   \n",
       "31                                          s acharya, m franklin, s zdonik   \n",
       "32                                                                      NaN   \n",
       "33              p deolasee, a katkar, a panchbudhe, k ramamritham, p shenoy   \n",
       "34                                       a dekhtyar, r ross, v subrahmanian   \n",
       "35                      s dar, m franklin, b jï¿½nsson, d srivastava, m tan   \n",
       "36                                                  d theodoratos, t sellis   \n",
       "37                         y papakonstantinou, s abiteboul, h garcia-molina   \n",
       "38                                           c bussler, d fensel, a maedche   \n",
       "39               m fang, n shivakumar, h garcia-molina, r motwani, j ullman   \n",
       "40                                 a deshpande, s nath, p gibbons, s seshan   \n",
       "41                                    d agrawal, a abbadi, a singh, t yurek   \n",
       "42                       p revesz, r chen, p kanjamala, y li, y liu, y wang   \n",
       "43                                 k aberer, p cudrï¿½-mauroux, m hauswirth   \n",
       "44                                    y ishikawa, r subramanya, c faloutsos   \n",
       "45                                 a deshpande, s nath, p gibbons, s seshan   \n",
       "46                i manolescu, d florescu, d kossmann, f xhumari, d olteanu   \n",
       "47                                                         h darwen, c date   \n",
       "48                      b becker, s gschwind, t ohler, b seeger, p widmayer   \n",
       "49            w bridge, a joshi, m keihl, t lahiri, j loaiza, n macnaughton   \n",
       "\n",
       "                 ltable_venue  \\\n",
       "0   ACM Trans. Database Syst.   \n",
       "1           SIGMOD Conference   \n",
       "2                        VLDB   \n",
       "3                        VLDB   \n",
       "4           SIGMOD Conference   \n",
       "5                        VLDB   \n",
       "6           SIGMOD Conference   \n",
       "7           SIGMOD Conference   \n",
       "8                        VLDB   \n",
       "9           SIGMOD Conference   \n",
       "10                       VLDB   \n",
       "11                    VLDB J.   \n",
       "12                       VLDB   \n",
       "13          SIGMOD Conference   \n",
       "14              SIGMOD Record   \n",
       "15                       VLDB   \n",
       "16          SIGMOD Conference   \n",
       "17                       VLDB   \n",
       "18                       VLDB   \n",
       "19                    VLDB J.   \n",
       "20                    VLDB J.   \n",
       "21          SIGMOD Conference   \n",
       "22  ACM Trans. Database Syst.   \n",
       "23                       VLDB   \n",
       "24          SIGMOD Conference   \n",
       "25          SIGMOD Conference   \n",
       "26                    VLDB J.   \n",
       "27                       VLDB   \n",
       "28                       VLDB   \n",
       "29                       VLDB   \n",
       "30              SIGMOD Record   \n",
       "31                       VLDB   \n",
       "32                        NaN   \n",
       "33          SIGMOD Conference   \n",
       "34  ACM Trans. Database Syst.   \n",
       "35                       VLDB   \n",
       "36                       VLDB   \n",
       "37                       VLDB   \n",
       "38              SIGMOD Record   \n",
       "39                       VLDB   \n",
       "40          SIGMOD Conference   \n",
       "41          SIGMOD Conference   \n",
       "42          SIGMOD Conference   \n",
       "43              SIGMOD Record   \n",
       "44                       VLDB   \n",
       "45          SIGMOD Conference   \n",
       "46                       VLDB   \n",
       "47              SIGMOD Record   \n",
       "48                    VLDB J.   \n",
       "49                       VLDB   \n",
       "\n",
       "                                                                        rtable_title  \\\n",
       "0                                                                on the semantics of   \n",
       "1                                  template-based wrappers in the tsimmis experience   \n",
       "2                                           mining deviants in time series databases   \n",
       "3                                  on functional properties of information filtering   \n",
       "4                                              efficient view maintenance warehouses   \n",
       "5                                           generic schema matching with cupid. 2001   \n",
       "6                                              strudel: a web-site management system   \n",
       "7                              join synopses for improving approximate query answers   \n",
       "8                                          optimizing queries across diverse sources   \n",
       "9                                           continuously adaptive continuous queries   \n",
       "10   histogram-based approximation of set-valued query-answers, in'the vldb journal'   \n",
       "11                                                          recovery for index trees   \n",
       "12                                                 leo å? db2â??s learning optimizer   \n",
       "13                                      join synopsesfor approximate query answering   \n",
       "14                                                   a query language for a web-site   \n",
       "15                                          â??streaming queries over streaming data   \n",
       "16      catching the boat with strudel: experience with a web-site management system   \n",
       "17                                          andez. schema mapping as query discovery   \n",
       "18                                avoiding retrieval of composite multimedia objects   \n",
       "19                                              minimization of tree pattern queries   \n",
       "20                                                      learning to match ontologies   \n",
       "21                                 a query language for a web-site management system   \n",
       "22                                 probabilistic temporal databases, part i: algebra   \n",
       "23  irisnet: an architecture for compute-intensive wide-area sensor network services   \n",
       "24                     the rasdaman approach to multidimensional database management   \n",
       "25                                                    sequence query processing proc   \n",
       "26                     building knowledge base management systems: a progress report   \n",
       "27                                        declarative data cleaning: language, model   \n",
       "28                                          a parallel index for semistructured data   \n",
       "29                                          active disks for large-scale data mining   \n",
       "30                                      &quot; domains, relations and religious wars   \n",
       "31                                s. zdonik disseminating updates on broadcast disks   \n",
       "32                                   application servers and associated technologies   \n",
       "33                                     dissemination of dynamic data on the internet   \n",
       "34                                      temporal probabilistic databases, i: algebra   \n",
       "35                             semantic data caching and replacement. 22nd vldb conf   \n",
       "36                                           data warehouse configuration algorithms   \n",
       "37                              object fusion in mediator systems (extended version)   \n",
       "38                                                 semantic web enabled web services   \n",
       "39                                               computing iceberg queries e ciently   \n",
       "40                           iris: internet-scale resource-intensive sensor services   \n",
       "41                                        efficient data view maintenance warehouses   \n",
       "42                                                  the mlpq/gis constraint database   \n",
       "43                                   m. hauswirth a framework for semantic gossiping   \n",
       "44                                            mindreader: querying databases through   \n",
       "45                                      cache-and-query for wide area sensor network   \n",
       "46                                                    living with xml and relational   \n",
       "47                                             the third manifesto acm sigmod record   \n",
       "48                                    ban asymptotically optimal multiversion b-tree   \n",
       "49                                                  the oracle universal server buer   \n",
       "\n",
       "                                         rtable_authors  \\\n",
       "0    j clifford, c dyreson, t isakowitz, cs jensen, r     \n",
       "1   j hammer, h garcia-molina, s nestorov, r yerneni,     \n",
       "2                  h jagdish, n koudas, s muthukrishnan   \n",
       "3      r sawai, m tsukamoto, t terada, yh loh, s nishio   \n",
       "4              d agrawal, a el abbadi, a singh, t yurek   \n",
       "5                      j madhavan, pa bernstein, e rahm   \n",
       "6                  m fernandez, j kang, a levy, d suciu   \n",
       "7         s acharya, p gibbons, v poosala, s ramaswarmy   \n",
       "8                  l haas, d kossmann, e wimmers, j yan   \n",
       "9             s madden, m shah, jm hellerstein, v raman   \n",
       "10                              ye ioannidis, v poosala   \n",
       "11                              ds lomet, b concurrency   \n",
       "12             m stillger, gm lohman, v markl, m kandil   \n",
       "13        s acharya, pb gibbons, v poosala, s ramaswamy   \n",
       "14                                      a levy, d suciu   \n",
       "15                           s chandrasekan, m franklin   \n",
       "16     d florescu, m fernandez, j kang, a levy, d suciu   \n",
       "17                             r miller, l haas, m hern   \n",
       "18            s chaudhuri, s ghandeharizadeh, s shahabi   \n",
       "19    s amer-yahia, s cho, lvs lakshmanan, d srivastava   \n",
       "20    a doan, j madhavan, r dhamankar, p domingos, ay     \n",
       "21             d florescu, a levy, m fernandez, d suciu   \n",
       "22                  a dekhtyar, r ross, vs subrahmanian   \n",
       "23   s nath, a deshpande, y ke, pb gibbons, b karp, s     \n",
       "24            p baumann, p furtado, r ritsch, n widmann   \n",
       "25                 p sheshadri, m livny, r ramakrishnan   \n",
       "26         j mylopoulos, v chaudhri, d plexousakis, t     \n",
       "27      h galhardas, d florescu, d shasha, e simon, c     \n",
       "28                        b cooper, n sample, m shadmon   \n",
       "29             e riedel, g gibson, a moore, c faloutsos   \n",
       "30                                     r wars, s record   \n",
       "31                                s acharya, m franklin   \n",
       "32                                              c mohan   \n",
       "33  k ramamritham, p deolasee, a katkar, a panchbudhe,    \n",
       "34                  a dekhtyar, r ross, vs subrahmanian   \n",
       "35    s dar, m franklin, b jonsson, d srivastava, m tan   \n",
       "36            s ligoudistianos, d theodoratos, t sellis   \n",
       "37                  y papakonstantinou, h garcia-molina   \n",
       "38                       d fensel, c bussler, a maedche   \n",
       "39  m fang, n shivakumar, h garcia-molina, r motwani,     \n",
       "40            a deshpande, s nath, pb gibbons, s seshan   \n",
       "41             d agrawal, a el abbadi, a singh, t yurek   \n",
       "42     pz revesz, r chen, p kanjamala, y li, y liu, y     \n",
       "43                           k aberer, p cudrã©-mauroux   \n",
       "44     y ishikawa, r subramanya, ps center, c faloutsos   \n",
       "45            a deshpande, s nath, pb gibbons, s seshan   \n",
       "46  f manolescu, d florescu, d kossmann, f xhumari, d     \n",
       "47                                    h darwen, cj date   \n",
       "48         b becker, s gschwind, t ohler, b seeger, p     \n",
       "49  w bridge, a joshi, m keihl, t lahiri, j loaiza, n     \n",
       "\n",
       "                                                           rtable_venue  \n",
       "0         Nowâ?? in databases,â?? ACM Transactions on Database Systems,  \n",
       "1            Proceedings of the ACM SIGMOD International Conference on   \n",
       "2                                                           Proc. VLDB,  \n",
       "3         Electronics and Communications in Japan(Part II Electronics),  \n",
       "4   Proc. Of the 1997 ACM SIGMOD International Conference on  &hellip;,  \n",
       "5                                                             Microsoft  \n",
       "6                                                                   NaN  \n",
       "7                                             Proc. of ACM SIGMOD Conf,  \n",
       "8                 &hellip;  of the 23rd VLDB Conference, Athens, Greece  \n",
       "9                  Proc. of the ACM SIGMOD International Conference on   \n",
       "10                                                                  NaN  \n",
       "11                                                        VLDB Journal,  \n",
       "12                                                    The VLDB Journal,  \n",
       "13                                                                  ACM  \n",
       "14                                                       SIGMOD Record,  \n",
       "15                                                                  NaN  \n",
       "16                Proceedings of ACM SIGMOD Conference on Management of  \n",
       "17                                                       Proc. of VLDB,  \n",
       "18                                                       Proc. of VLDB,  \n",
       "19                                                                  NaN  \n",
       "20                                                        VLDB Journal,  \n",
       "21                                                       SIGMOD Record,  \n",
       "22                                ACM Transactions on Database Systems,  \n",
       "23                                                                  NaN  \n",
       "24                                                                  NaN  \n",
       "25                                                     ACM SIGMOD Conf,  \n",
       "26                                                                  NaN  \n",
       "27                             and Algorithms. Technical report, INRIA,  \n",
       "28                                                                  NaN  \n",
       "29                                                                  NaN  \n",
       "30                                                       SIGMOD Record,  \n",
       "31                                                 22nd VLDB Conference  \n",
       "32                                                                  NaN  \n",
       "33                                                                  NaN  \n",
       "34                                ACM Transactions on Database Systems,  \n",
       "35                                                       Bombay, India,  \n",
       "36                                                                  NaN  \n",
       "37                                                                  NaN  \n",
       "38                                                                  NaN  \n",
       "39                                                                  NaN  \n",
       "40                                                                  NaN  \n",
       "41                                                    Proc. ACM SIGMOD,  \n",
       "42                                          Proceedings of SIGMOD 2000,  \n",
       "43                                                               SIGMOD  \n",
       "44                                                                  NaN  \n",
       "45                                                   SIGMOD Conference,  \n",
       "46                                               Proc. VLDB conference,  \n",
       "47                                                                  NaN  \n",
       "48                                                             The VLDB  \n",
       "49                                                                  NaN  "
      ],
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>_id</th>\n      <th>ltable_id</th>\n      <th>rtable_id</th>\n      <th>ltable_title</th>\n      <th>ltable_authors</th>\n      <th>ltable_venue</th>\n      <th>rtable_title</th>\n      <th>rtable_authors</th>\n      <th>rtable_venue</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>journals/tods/CliffordDIJS97</td>\n      <td>sflWMxkOH2cJ</td>\n      <td>on the semantics of ``now'' in databases</td>\n      <td>j clifford, c dyreson, t isakowitz, c jensen, r snodgrass</td>\n      <td>ACM Trans. Database Syst.</td>\n      <td>on the semantics of</td>\n      <td>j clifford, c dyreson, t isakowitz, cs jensen, r</td>\n      <td>Nowâ?? in databases,â?? 
ACM Transactions on Database Systems,</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1</td>\n      <td>conf/sigmod/HammerGNYBV97</td>\n      <td>2rysKgS6lugJ</td>\n      <td>template-based wrappers in the tsimmis system</td>\n      <td>j hammer, h garcia-molina, s nestorov, r yerneni, m breunig, v vassalos</td>\n      <td>SIGMOD Conference</td>\n      <td>template-based wrappers in the tsimmis experience</td>\n      <td>j hammer, h garcia-molina, s nestorov, r yerneni,</td>\n      <td>Proceedings of the ACM SIGMOD International Conference on</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2</td>\n      <td>conf/vldb/KoudasMJ99</td>\n      <td>wzae2lO15U8J</td>\n      <td>mining deviants in a time series database</td>\n      <td>h jagadish, n koudas, s muthukrishnan</td>\n      <td>VLDB</td>\n      <td>mining deviants in time series databases</td>\n      <td>h jagdish, n koudas, s muthukrishnan</td>\n      <td>Proc. VLDB,</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>3</td>\n      <td>conf/vldb/NishioSTTL01</td>\n      <td>S30IXg4ClFIJ</td>\n      <td>functional properties of information filtering</td>\n      <td>r sawai, m tsukamoto, y loh, t terada, s nishio</td>\n      <td>VLDB</td>\n      <td>on functional properties of information filtering</td>\n      <td>r sawai, m tsukamoto, t terada, yh loh, s nishio</td>\n      <td>Electronics and Communications in Japan(Part II Electronics),</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>4</td>\n      <td>conf/sigmod/AgrawalASY97</td>\n      <td>7Of1HyK-VnkJ</td>\n      <td>efficient view maintenance at data warehouses</td>\n      <td>d agrawal, a abbadi, a singh, t yurek</td>\n      <td>SIGMOD Conference</td>\n      <td>efficient view maintenance warehouses</td>\n      <td>d agrawal, a el abbadi, a singh, t yurek</td>\n      <td>Proc. 
Of the 1997 ACM SIGMOD International Conference on  &amp;hellip;,</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>5</td>\n      <td>conf/vldb/MadhavanBR01</td>\n      <td>66dWDpWjbbwJ</td>\n      <td>generic schema matching with cupid</td>\n      <td>j madhavan, p bernstein, e rahm</td>\n      <td>VLDB</td>\n      <td>generic schema matching with cupid. 2001</td>\n      <td>j madhavan, pa bernstein, e rahm</td>\n      <td>Microsoft</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>6</td>\n      <td>conf/sigmod/FernandezFKLS98</td>\n      <td>ek26aiEheesJ</td>\n      <td>catching the boat with strudel: experiences with a web-site management system</td>\n      <td>m fernandez, d florescu, j kang, a levy, d suciu</td>\n      <td>SIGMOD Conference</td>\n      <td>strudel: a web-site management system</td>\n      <td>m fernandez, j kang, a levy, d suciu</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>7</td>\n      <td>conf/sigmod/AcharyaGPR99a</td>\n      <td>ahfoveFQC2sJ</td>\n      <td>join synopses for approximate query answering</td>\n      <td>s acharya, p gibbons, v poosala, s ramaswamy</td>\n      <td>SIGMOD Conference</td>\n      <td>join synopses for improving approximate query answers</td>\n      <td>s acharya, p gibbons, v poosala, s ramaswarmy</td>\n      <td>Proc. 
of ACM SIGMOD Conf,</td>\n    </tr>\n    <tr>\n      <th>8</th>\n      <td>8</td>\n      <td>conf/vldb/HaasKWY97</td>\n      <td>yIRQRtTEzSAJ</td>\n      <td>optimizing queries across diverse data sources</td>\n      <td>l haas, d kossmann, e wimmers, j yang</td>\n      <td>VLDB</td>\n      <td>optimizing queries across diverse sources</td>\n      <td>l haas, d kossmann, e wimmers, j yan</td>\n      <td>&amp;hellip;  of the 23rd VLDB Conference, Athens, Greece</td>\n    </tr>\n    <tr>\n      <th>9</th>\n      <td>9</td>\n      <td>conf/sigmod/MaddenSHR02</td>\n      <td>kjtuCATx_uoJ</td>\n      <td>continuously adaptive continuous queries over streams</td>\n      <td>s madden, m shah, j hellerstein, v raman</td>\n      <td>SIGMOD Conference</td>\n      <td>continuously adaptive continuous queries</td>\n      <td>s madden, m shah, jm hellerstein, v raman</td>\n      <td>Proc. of the ACM SIGMOD International Conference on</td>\n    </tr>\n    <tr>\n      <th>10</th>\n      <td>10</td>\n      <td>conf/vldb/IoannidisP99</td>\n      <td>BlALg9KQCLAJ</td>\n      <td>histogram-based approximation of set-valued query-answers</td>\n      <td>y ioannidis, v poosala</td>\n      <td>VLDB</td>\n      <td>histogram-based approximation of set-valued query-answers, in'the vldb journal'</td>\n      <td>ye ioannidis, v poosala</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>11</th>\n      <td>11</td>\n      <td>journals/vldb/LometS97</td>\n      <td>5EjrRrS54fsJ</td>\n      <td>concurrency and recovery for index trees</td>\n      <td>d lomet, b salzberg</td>\n      <td>VLDB J.</td>\n      <td>recovery for index trees</td>\n      <td>ds lomet, b concurrency</td>\n      <td>VLDB Journal,</td>\n    </tr>\n    <tr>\n      <th>12</th>\n      <td>12</td>\n      <td>conf/vldb/StillgerLMK01</td>\n      <td>R43ssyRp674J</td>\n      <td>leo - db2's learning optimizer</td>\n      <td>m stillger, g lohman, v markl, m kandil</td>\n      <td>VLDB</td>\n      <td>leo å? 
db2â??s learning optimizer</td>\n      <td>m stillger, gm lohman, v markl, m kandil</td>\n      <td>The VLDB Journal,</td>\n    </tr>\n    <tr>\n      <th>13</th>\n      <td>13</td>\n      <td>conf/sigmod/AcharyaGPR99a</td>\n      <td>czJmrc_4tZ4J</td>\n      <td>join synopses for approximate query answering</td>\n      <td>s acharya, p gibbons, v poosala, s ramaswamy</td>\n      <td>SIGMOD Conference</td>\n      <td>join synopsesfor approximate query answering</td>\n      <td>s acharya, pb gibbons, v poosala, s ramaswamy</td>\n      <td>ACM</td>\n    </tr>\n    <tr>\n      <th>14</th>\n      <td>14</td>\n      <td>journals/sigmod/FernandezFLS97</td>\n      <td>url:http://portal.acm.org/ft_gateway.cfm%3Fid%3D262763%26type%3Dpdf%26dl%3Dportal%26dl%3DACM%26C...</td>\n      <td>a query language for a web-site management system</td>\n      <td>m fernandez, d florescu, a levy, d suciu</td>\n      <td>SIGMOD Record</td>\n      <td>a query language for a web-site</td>\n      <td>a levy, d suciu</td>\n      <td>SIGMOD Record,</td>\n    </tr>\n    <tr>\n      <th>15</th>\n      <td>15</td>\n      <td>conf/vldb/ChandrasekaranF02</td>\n      <td>b408sdGcY3UJ</td>\n      <td>streaming queries over streaming data</td>\n      <td>s chandrasekaran, m franklin</td>\n      <td>VLDB</td>\n      <td>â??streaming queries over streaming data</td>\n      <td>s chandrasekan, m franklin</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>16</th>\n      <td>16</td>\n      <td>conf/sigmod/FernandezFKLS97</td>\n      <td>N9NHi5y3K38J</td>\n      <td>strudel: a web-site management system</td>\n      <td>m fernandez, d florescu, j kang, a levy, d suciu</td>\n      <td>SIGMOD Conference</td>\n      <td>catching the boat with strudel: experience with a web-site management system</td>\n      <td>d florescu, m fernandez, j kang, a levy, d suciu</td>\n      <td>Proceedings of ACM SIGMOD Conference on Management of</td>\n    </tr>\n    <tr>\n      <th>17</th>\n      <td>17</td>\n      
<td>conf/vldb/MillerHH00</td>\n      <td>kfTc_TsqiP8J</td>\n      <td>schema mapping as query discovery</td>\n      <td>r miller, l haas, m hernï¿½ndez</td>\n      <td>VLDB</td>\n      <td>andez. schema mapping as query discovery</td>\n      <td>r miller, l haas, m hern</td>\n      <td>Proc. of VLDB,</td>\n    </tr>\n    <tr>\n      <th>18</th>\n      <td>18</td>\n      <td>conf/vldb/ChaudhuriGS95</td>\n      <td>WYgQ_Mhdp9sJ</td>\n      <td>retrieval of composite multimedia objects</td>\n      <td>s chaudhuri, s ghandeharizadeh, c shahabi</td>\n      <td>VLDB</td>\n      <td>avoiding retrieval of composite multimedia objects</td>\n      <td>s chaudhuri, s ghandeharizadeh, s shahabi</td>\n      <td>Proc. of VLDB,</td>\n    </tr>\n    <tr>\n      <th>19</th>\n      <td>19</td>\n      <td>journals/vldb/Amer-YahiaCLS02</td>\n      <td>DFubU20j1OIJ</td>\n      <td>tree pattern query minimization</td>\n      <td>s amer-yahia, s cho, l lakshmanan, d srivastava</td>\n      <td>VLDB J.</td>\n      <td>minimization of tree pattern queries</td>\n      <td>s amer-yahia, s cho, lvs lakshmanan, d srivastava</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>20</th>\n      <td>20</td>\n      <td>journals/vldb/DoanMDDH03</td>\n      <td>InzqepkG0W8J</td>\n      <td>learning to match ontologies on the semantic web</td>\n      <td>a doan, j madhavan, r dhamankar, p domingos, a halevy</td>\n      <td>VLDB J.</td>\n      <td>learning to match ontologies</td>\n      <td>a doan, j madhavan, r dhamankar, p domingos, ay</td>\n      <td>VLDB Journal,</td>\n    </tr>\n    <tr>\n      <th>21</th>\n      <td>21</td>\n      <td>conf/sigmod/FernandezFKLS97</td>\n      <td>9NlVdFCl2ysJ</td>\n      <td>strudel: a web-site management system</td>\n      <td>m fernandez, d florescu, j kang, a levy, d suciu</td>\n      <td>SIGMOD Conference</td>\n      <td>a query language for a web-site management system</td>\n      <td>d florescu, a levy, m fernandez, d suciu</td>\n      <td>SIGMOD 
Record,</td>\n    </tr>\n    <tr>\n      <th>22</th>\n      <td>22</td>\n      <td>journals/tods/DekhtyarRS01</td>\n      <td>Bho8kTicu-8J</td>\n      <td>probabilistic temporal databases, i: algebra</td>\n      <td>a dekhtyar, r ross, v subrahmanian</td>\n      <td>ACM Trans. Database Syst.</td>\n      <td>probabilistic temporal databases, part i: algebra</td>\n      <td>a dekhtyar, r ross, vs subrahmanian</td>\n      <td>ACM Transactions on Database Systems,</td>\n    </tr>\n    <tr>\n      <th>23</th>\n      <td>23</td>\n      <td>conf/vldb/NathDKGKS03</td>\n      <td>aP_cBgCHoGYJ</td>\n      <td>irisnet: an architecture for internet-scale sensing services</td>\n      <td>s nath, a deshpande, y ke, p gibbons, b karp, s seshan</td>\n      <td>VLDB</td>\n      <td>irisnet: an architecture for compute-intensive wide-area sensor network services</td>\n      <td>s nath, a deshpande, y ke, pb gibbons, b karp, s</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>24</th>\n      <td>24</td>\n      <td>conf/sigmod/BaumannDFRW98</td>\n      <td>7rJ35WBjoOAJ</td>\n      <td>the multidimensional database system rasdaman</td>\n      <td>p baumann, a dehmel, p furtado, r ritsch, n widmann</td>\n      <td>SIGMOD Conference</td>\n      <td>the rasdaman approach to multidimensional database management</td>\n      <td>p baumann, p furtado, r ritsch, n widmann</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>25</th>\n      <td>25</td>\n      <td>conf/sigmod/SeshadriLR94</td>\n      <td>G4N8qaBAG4kJ</td>\n      <td>sequence query processing</td>\n      <td>p seshadri, m livny, r ramakrishnan</td>\n      <td>SIGMOD Conference</td>\n      <td>sequence query processing proc</td>\n      <td>p sheshadri, m livny, r ramakrishnan</td>\n      <td>ACM SIGMOD Conf,</td>\n    </tr>\n    <tr>\n      <th>26</th>\n      <td>26</td>\n      <td>journals/vldb/MylopoulosCPST96</td>\n      <td>M8Eh286-D0wJ</td>\n      <td>building knowledge base management systems</td>\n      <td>j 
mylopoulos, v chaudhri, d plexousakis, a shrufi, t topaloglou</td>\n      <td>VLDB J.</td>\n      <td>building knowledge base management systems: a progress report</td>\n      <td>j mylopoulos, v chaudhri, d plexousakis, t</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>27</th>\n      <td>27</td>\n      <td>conf/vldb/GalhardasFSSS01</td>\n      <td>otGW0G5B6AYJ</td>\n      <td>declarative data cleaning: language, model, and algorithms</td>\n      <td>h galhardas, d florescu, d shasha, e simon, c saita</td>\n      <td>VLDB</td>\n      <td>declarative data cleaning: language, model</td>\n      <td>h galhardas, d florescu, d shasha, e simon, c</td>\n      <td>and Algorithms. Technical report, INRIA,</td>\n    </tr>\n    <tr>\n      <th>28</th>\n      <td>28</td>\n      <td>conf/vldb/CooperSFHS01</td>\n      <td>R63Nez_SNlAJ</td>\n      <td>a fast index for semistructured data</td>\n      <td>b cooper, n sample, m franklin, g hjaltason, m shadmon</td>\n      <td>VLDB</td>\n      <td>a parallel index for semistructured data</td>\n      <td>b cooper, n sample, m shadmon</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>29</th>\n      <td>29</td>\n      <td>conf/vldb/RiedelGF98</td>\n      <td>VyG3DtMwR-wJ</td>\n      <td>active storage for large-scale data mining and multimedia</td>\n      <td>e riedel, g gibson, c faloutsos</td>\n      <td>VLDB</td>\n      <td>active disks for large-scale data mining</td>\n      <td>e riedel, g gibson, a moore, c faloutsos</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>30</th>\n      <td>30</td>\n      <td>journals/sigmod/Camps96</td>\n      <td>url:http://portal.acm.org/ft_gateway.cfm%3Fid%3D245885%26type%3Dpdf%26dl%3DGUIDE%26dl%3DACM%26CF...</td>\n      <td>domains, relations and religious wars</td>\n      <td>r camps</td>\n      <td>SIGMOD Record</td>\n      <td>&amp;quot; domains, relations and religious wars</td>\n      <td>r wars, s record</td>\n      <td>SIGMOD Record,</td>\n    </tr>\n    <tr>\n      
<th>31</th>\n      <td>31</td>\n      <td>conf/vldb/AcharyaFZ96</td>\n      <td>qTCsXcCsI7UJ</td>\n      <td>disseminating updates on broadcast disks</td>\n      <td>s acharya, m franklin, s zdonik</td>\n      <td>VLDB</td>\n      <td>s. zdonik disseminating updates on broadcast disks</td>\n      <td>s acharya, m franklin</td>\n      <td>22nd VLDB Conference</td>\n    </tr>\n    <tr>\n      <th>32</th>\n      <td>32</td>\n      <td>conf/sigmod/Mohan02</td>\n      <td>YJlUtEk7zwkJ</td>\n      <td>tutorial: application servers and associated technologies</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>application servers and associated technologies</td>\n      <td>c mohan</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>33</th>\n      <td>33</td>\n      <td>conf/sigmod/DeolaseeKPRS01</td>\n      <td>tEFUZn3tEMYJ</td>\n      <td>dissemination of dynamic data</td>\n      <td>p deolasee, a katkar, a panchbudhe, k ramamritham, p shenoy</td>\n      <td>SIGMOD Conference</td>\n      <td>dissemination of dynamic data on the internet</td>\n      <td>k ramamritham, p deolasee, a katkar, a panchbudhe,</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>34</th>\n      <td>34</td>\n      <td>journals/tods/DekhtyarRS01</td>\n      <td>ywEAyex_frMJ</td>\n      <td>probabilistic temporal databases, i: algebra</td>\n      <td>a dekhtyar, r ross, v subrahmanian</td>\n      <td>ACM Trans. Database Syst.</td>\n      <td>temporal probabilistic databases, i: algebra</td>\n      <td>a dekhtyar, r ross, vs subrahmanian</td>\n      <td>ACM Transactions on Database Systems,</td>\n    </tr>\n    <tr>\n      <th>35</th>\n      <td>35</td>\n      <td>conf/vldb/DarFJST96</td>\n      <td>QdWd_yg3YGMJ</td>\n      <td>semantic data caching and replacement</td>\n      <td>s dar, m franklin, b jï¿½nsson, d srivastava, m tan</td>\n      <td>VLDB</td>\n      <td>semantic data caching and replacement. 
22nd vldb conf</td>\n      <td>s dar, m franklin, b jonsson, d srivastava, m tan</td>\n      <td>Bombay, India,</td>\n    </tr>\n    <tr>\n      <th>36</th>\n      <td>36</td>\n      <td>conf/vldb/TheodoratosS97</td>\n      <td>CuNBKuE8AH4J</td>\n      <td>data warehouse configuration</td>\n      <td>d theodoratos, t sellis</td>\n      <td>VLDB</td>\n      <td>data warehouse configuration algorithms</td>\n      <td>s ligoudistianos, d theodoratos, t sellis</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>37</th>\n      <td>37</td>\n      <td>conf/vldb/PapakonstantinouAG96</td>\n      <td>BIF2jRwmL0EJ</td>\n      <td>object fusion in mediator systems</td>\n      <td>y papakonstantinou, s abiteboul, h garcia-molina</td>\n      <td>VLDB</td>\n      <td>object fusion in mediator systems (extended version)</td>\n      <td>y papakonstantinou, h garcia-molina</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>38</th>\n      <td>38</td>\n      <td>journals/sigmod/BusslerFM02</td>\n      <td>C0rxF4CLEmcJ</td>\n      <td>a conceptual architecture for semantic web enabled web services</td>\n      <td>c bussler, d fensel, a maedche</td>\n      <td>SIGMOD Record</td>\n      <td>semantic web enabled web services</td>\n      <td>d fensel, c bussler, a maedche</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>39</th>\n      <td>39</td>\n      <td>conf/vldb/FangSGMU98</td>\n      <td>F8EAXPxQQsIJ</td>\n      <td>computing iceberg queries efficiently</td>\n      <td>m fang, n shivakumar, h garcia-molina, r motwani, j ullman</td>\n      <td>VLDB</td>\n      <td>computing iceberg queries e ciently</td>\n      <td>m fang, n shivakumar, h garcia-molina, r motwani,</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>40</th>\n      <td>40</td>\n      <td>conf/sigmod/DeshpandeNGS03a</td>\n      <td>Thoke4oYLt4J</td>\n      <td>irisnet: internet-scale resource-intensive sensor services</td>\n      <td>a deshpande, s nath, p gibbons, s seshan</td>\n      <td>SIGMOD 
Conference</td>\n      <td>iris: internet-scale resource-intensive sensor services</td>\n      <td>a deshpande, s nath, pb gibbons, s seshan</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>41</th>\n      <td>41</td>\n      <td>conf/sigmod/AgrawalASY97</td>\n      <td>-XBy73K2jq0J</td>\n      <td>efficient view maintenance at data warehouses</td>\n      <td>d agrawal, a abbadi, a singh, t yurek</td>\n      <td>SIGMOD Conference</td>\n      <td>efficient data view maintenance warehouses</td>\n      <td>d agrawal, a el abbadi, a singh, t yurek</td>\n      <td>Proc. ACM SIGMOD,</td>\n    </tr>\n    <tr>\n      <th>42</th>\n      <td>42</td>\n      <td>conf/sigmod/ReveszCKLLW00</td>\n      <td>bx_A3QEqKhEJ</td>\n      <td>the mlpq/gis constraint database system</td>\n      <td>p revesz, r chen, p kanjamala, y li, y liu, y wang</td>\n      <td>SIGMOD Conference</td>\n      <td>the mlpq/gis constraint database</td>\n      <td>pz revesz, r chen, p kanjamala, y li, y liu, y</td>\n      <td>Proceedings of SIGMOD 2000,</td>\n    </tr>\n    <tr>\n      <th>43</th>\n      <td>43</td>\n      <td>journals/sigmod/AbererCH02</td>\n      <td>YUtQ8KXNntYJ</td>\n      <td>a framework for semantic gossiping</td>\n      <td>k aberer, p cudrï¿½-mauroux, m hauswirth</td>\n      <td>SIGMOD Record</td>\n      <td>m. 
hauswirth a framework for semantic gossiping</td>\n      <td>k aberer, p cudrã©-mauroux</td>\n      <td>SIGMOD</td>\n    </tr>\n    <tr>\n      <th>44</th>\n      <td>44</td>\n      <td>conf/vldb/IshikawaSF98</td>\n      <td>V3c_ivUAs34J</td>\n      <td>mindreader: querying databases through multiple examples</td>\n      <td>y ishikawa, r subramanya, c faloutsos</td>\n      <td>VLDB</td>\n      <td>mindreader: querying databases through</td>\n      <td>y ishikawa, r subramanya, ps center, c faloutsos</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>45</th>\n      <td>45</td>\n      <td>conf/sigmod/DeshpandeNGS03</td>\n      <td>aBiqHXGXoJEJ</td>\n      <td>cache-and-query for wide area sensor databases</td>\n      <td>a deshpande, s nath, p gibbons, s seshan</td>\n      <td>SIGMOD Conference</td>\n      <td>cache-and-query for wide area sensor network</td>\n      <td>a deshpande, s nath, pb gibbons, s seshan</td>\n      <td>SIGMOD Conference,</td>\n    </tr>\n    <tr>\n      <th>46</th>\n      <td>46</td>\n      <td>conf/vldb/ManolescuFKXO00</td>\n      <td>0mGtqfQNJ-4J</td>\n      <td>agora: living with xml and relational</td>\n      <td>i manolescu, d florescu, d kossmann, f xhumari, d olteanu</td>\n      <td>VLDB</td>\n      <td>living with xml and relational</td>\n      <td>f manolescu, d florescu, d kossmann, f xhumari, d</td>\n      <td>Proc. 
VLDB conference,</td>\n    </tr>\n    <tr>\n      <th>47</th>\n      <td>47</td>\n      <td>journals/sigmod/DarwenD95</td>\n      <td>Xa_whOlvG7gJ</td>\n      <td>the third manifesto</td>\n      <td>h darwen, c date</td>\n      <td>SIGMOD Record</td>\n      <td>the third manifesto acm sigmod record</td>\n      <td>h darwen, cj date</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>48</th>\n      <td>48</td>\n      <td>journals/vldb/BeckerGOSW96</td>\n      <td>K53EIcG4ta8J</td>\n      <td>an asymptotically optimal multiversion b-tree</td>\n      <td>b becker, s gschwind, t ohler, b seeger, p widmayer</td>\n      <td>VLDB J.</td>\n      <td>ban asymptotically optimal multiversion b-tree</td>\n      <td>b becker, s gschwind, t ohler, b seeger, p</td>\n      <td>The VLDB</td>\n    </tr>\n    <tr>\n      <th>49</th>\n      <td>49</td>\n      <td>conf/vldb/BridgeJKLLM97</td>\n      <td>8SYZpKAeTwQJ</td>\n      <td>the oracle universal server buffer</td>\n      <td>w bridge, a joshi, m keihl, t lahiri, j loaiza, n macnaughton</td>\n      <td>VLDB</td>\n      <td>the oracle universal server buer</td>\n      <td>w bridge, a joshi, m keihl, t lahiri, j loaiza, n</td>\n      <td>NaN</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {},
     "execution_count": 17
    }
   ],
   "source": [
    "# Run the blocker debugger against candidate set G and keep the 50 pairs it\n",
    "# reports (pairs missing from G that still look like matches), then show\n",
    "# them as the cell output for manual inspection.\n",
    "dbg = em.debug_blocker(\n",
    "    candidate_set=G,\n",
    "    ltable=dblp,\n",
    "    rtable=scholar,\n",
    "    output_size=50,\n",
    ")\n",
    "dbg"
   ]
  },
  {
   "source": [
    "### Save dataset and set aside sample for labeling\n",
    "保存候选集数据，并抽取部分样本用于人工标注"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the full candidate set G so later steps can reload it.\n",
    "file_name = 'datasets/DBLP-Scholar/candidates.csv'\n",
    "G.to_csv(file_name, sep=\",\")\n",
    "\n",
    "# Draw the subset of candidate pairs that will be labeled by hand.\n",
    "# em.sample_table takes no seed argument and samples through NumPy's global\n",
    "# RNG, so seed it here; otherwise every Restart & Run All picks different\n",
    "# rows and any labels already collected no longer line up with the sample.\n",
    "np.random.seed(42)\n",
    "s_file_name = \"datasets/DBLP-Scholar/candidates_sample.csv\"\n",
    "# Sampling is without replacement, so cap the request at the table size\n",
    "# instead of failing when G has fewer than 1000 rows.\n",
    "S = em.sample_table(G, min(1000, len(G)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "    _id                           ltable_id  \\\n",
       "5     5                    conf/sigmod/2000   \n",
       "6     6                    conf/sigmod/2000   \n",
       "16   16                    conf/sigmod/2000   \n",
       "35   35                    conf/sigmod/2002   \n",
       "40   40                    conf/sigmod/2002   \n",
       "42   42                    conf/sigmod/2002   \n",
       "51   51                    conf/sigmod/2002   \n",
       "55   55                    conf/sigmod/2002   \n",
       "81   81                    conf/sigmod/2003   \n",
       "93   93  conf/sigmod/AbadiCCCCEGHMRSSTXYZ03   \n",
       "\n",
       "                                                                                              rtable_id  \\\n",
       "5                                                                                          CpwYam_LYyEJ   \n",
       "6                                                                                          F9KzvnDpCPUJ   \n",
       "16                                                 url:http://portal.acm.org/citation.cfm%3Fid%3D191839   \n",
       "35                                                                                         QnMrKru1S1MJ   \n",
       "40                     url:http://portal.acm.org/citation.cfm%3Fcoll%3DGUIDE%26dl%3DGUIDE%26id%3D276377   \n",
       "42  url:http://portal.acm.org/citation.cfm%3Fid%3D16894%26dl%3DGUIDE%26dl%3DGUIDE%26type%3Dproceedin...   \n",
       "51                                            url:http://portal.acm.org/citation.cfm%3Fid%3D38713.38742   \n",
       "55                                                                                         vfTH6FzOpDAJ   \n",
       "81  url:http://portal.acm.org/citation.cfm%3Fid%3D50202%26dl%3DGUIDE%26dl%3DGUIDE%26type%3Dproceedin...   \n",
       "93                                                                                         eBnT7lhV2LwJ   \n",
       "\n",
       "                                                                                           ltable_title  \\\n",
       "5   proceedings of the 2000 acm sigmod international conference on management of data, may 16-18, 20...   \n",
       "6   proceedings of the 2000 acm sigmod international conference on management of data, may 16-18, 20...   \n",
       "16  proceedings of the 2000 acm sigmod international conference on management of data, may 16-18, 20...   \n",
       "35  proceedings of the 2002 acm sigmod international conference on management of data, madison, wisc...   \n",
       "40  proceedings of the 2002 acm sigmod international conference on management of data, madison, wisc...   \n",
       "42  proceedings of the 2002 acm sigmod international conference on management of data, madison, wisc...   \n",
       "51  proceedings of the 2002 acm sigmod international conference on management of data, madison, wisc...   \n",
       "55  proceedings of the 2002 acm sigmod international conference on management of data, madison, wisc...   \n",
       "81  proceedings of the 2003 acm sigmod international conference on management of data, san diego, ca...   \n",
       "93                                                              aurora: a data stream management system   \n",
       "\n",
       "                                                                                         ltable_authors  \\\n",
       "5                                                                                                   NaN   \n",
       "6                                                                                                   NaN   \n",
       "16                                                                                                  NaN   \n",
       "35                                                                                                  NaN   \n",
       "40                                                                                                  NaN   \n",
       "42                                                                                                  NaN   \n",
       "51                                                                                                  NaN   \n",
       "55                                                                                                  NaN   \n",
       "81                                                                                                  NaN   \n",
       "93  d abadi, d carney, u ï¿½etintemel, m cherniack, c convey, c erwin, e galvez, m hatoun, a maskey,...   \n",
       "\n",
       "    ltable_year  \\\n",
       "5          2000   \n",
       "6          2000   \n",
       "16         2000   \n",
       "35         2002   \n",
       "40         2002   \n",
       "42         2002   \n",
       "51         2002   \n",
       "55         2002   \n",
       "81         2003   \n",
       "93         2003   \n",
       "\n",
       "                                                                                           rtable_title  \\\n",
       "5   on predicting data cache behavior for real {time systems. in proceedings of the acm sigplan work...   \n",
       "6   similarity-based queries for time series data. in: proceedings of the acm sigmod conference on t...   \n",
       "16                                        source international conference on management of data archive   \n",
       "35  blocation-based spatial queries,^ in proceedings of the acm international conference on manageme...   \n",
       "40       full text pdf format pdf (424 kb)source international conference on management of data archive   \n",
       "42                                        source international conference on management of data archive   \n",
       "51      full text pdf format pdf (1.24 mb)source international conference on management of data archive   \n",
       "55  data modeling of time-based media. in: proceedings of the 1994 acm sigmod international conferen...   \n",
       "81                                        source international conference on management of data archive   \n",
       "93                                           aurora: a data stream management system (demo description)   \n",
       "\n",
       "                                         rtable_authors  rtable_year  \n",
       "5                                c ferdinand, r wilhelm            0  \n",
       "6                                 d raflei, a mendelzon         1997  \n",
       "16                             rt snodgrass, m winslett         1994  \n",
       "35            j zhang, m zhu, d papadias, y tao, dl lee         2003  \n",
       "40  c olston, a woodruff, a aiken, m chu, v ercegovac,          1998  \n",
       "42                                            c zaniolo         1986  \n",
       "51                             h garcia-molina, k salem         1987  \n",
       "55                s gibbs, c breiteneder, d tsichritzis         1994  \n",
       "81                                   h boral, pa larson         1988  \n",
       "93    d abadi, d carney, u cetintemel, m cherniack, c              0  "
      ],
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>_id</th>\n      <th>ltable_id</th>\n      <th>rtable_id</th>\n      <th>ltable_title</th>\n      <th>ltable_authors</th>\n      <th>ltable_year</th>\n      <th>rtable_title</th>\n      <th>rtable_authors</th>\n      <th>rtable_year</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>5</th>\n      <td>5</td>\n      <td>conf/sigmod/2000</td>\n      <td>CpwYam_LYyEJ</td>\n      <td>proceedings of the 2000 acm sigmod international conference on management of data, may 16-18, 20...</td>\n      <td>NaN</td>\n      <td>2000</td>\n      <td>on predicting data cache behavior for real {time systems. in proceedings of the acm sigplan work...</td>\n      <td>c ferdinand, r wilhelm</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>6</td>\n      <td>conf/sigmod/2000</td>\n      <td>F9KzvnDpCPUJ</td>\n      <td>proceedings of the 2000 acm sigmod international conference on management of data, may 16-18, 20...</td>\n      <td>NaN</td>\n      <td>2000</td>\n      <td>similarity-based queries for time series data. 
in: proceedings of the acm sigmod conference on t...</td>\n      <td>d raflei, a mendelzon</td>\n      <td>1997</td>\n    </tr>\n    <tr>\n      <th>16</th>\n      <td>16</td>\n      <td>conf/sigmod/2000</td>\n      <td>url:http://portal.acm.org/citation.cfm%3Fid%3D191839</td>\n      <td>proceedings of the 2000 acm sigmod international conference on management of data, may 16-18, 20...</td>\n      <td>NaN</td>\n      <td>2000</td>\n      <td>source international conference on management of data archive</td>\n      <td>rt snodgrass, m winslett</td>\n      <td>1994</td>\n    </tr>\n    <tr>\n      <th>35</th>\n      <td>35</td>\n      <td>conf/sigmod/2002</td>\n      <td>QnMrKru1S1MJ</td>\n      <td>proceedings of the 2002 acm sigmod international conference on management of data, madison, wisc...</td>\n      <td>NaN</td>\n      <td>2002</td>\n      <td>blocation-based spatial queries,^ in proceedings of the acm international conference on manageme...</td>\n      <td>j zhang, m zhu, d papadias, y tao, dl lee</td>\n      <td>2003</td>\n    </tr>\n    <tr>\n      <th>40</th>\n      <td>40</td>\n      <td>conf/sigmod/2002</td>\n      <td>url:http://portal.acm.org/citation.cfm%3Fcoll%3DGUIDE%26dl%3DGUIDE%26id%3D276377</td>\n      <td>proceedings of the 2002 acm sigmod international conference on management of data, madison, wisc...</td>\n      <td>NaN</td>\n      <td>2002</td>\n      <td>full text pdf format pdf (424 kb)source international conference on management of data archive</td>\n      <td>c olston, a woodruff, a aiken, m chu, v ercegovac,</td>\n      <td>1998</td>\n    </tr>\n    <tr>\n      <th>42</th>\n      <td>42</td>\n      <td>conf/sigmod/2002</td>\n      <td>url:http://portal.acm.org/citation.cfm%3Fid%3D16894%26dl%3DGUIDE%26dl%3DGUIDE%26type%3Dproceedin...</td>\n      <td>proceedings of the 2002 acm sigmod international conference on management of data, madison, wisc...</td>\n      <td>NaN</td>\n      <td>2002</td>\n      <td>source international 
conference on management of data archive</td>\n      <td>c zaniolo</td>\n      <td>1986</td>\n    </tr>\n    <tr>\n      <th>51</th>\n      <td>51</td>\n      <td>conf/sigmod/2002</td>\n      <td>url:http://portal.acm.org/citation.cfm%3Fid%3D38713.38742</td>\n      <td>proceedings of the 2002 acm sigmod international conference on management of data, madison, wisc...</td>\n      <td>NaN</td>\n      <td>2002</td>\n      <td>full text pdf format pdf (1.24 mb)source international conference on management of data archive</td>\n      <td>h garcia-molina, k salem</td>\n      <td>1987</td>\n    </tr>\n    <tr>\n      <th>55</th>\n      <td>55</td>\n      <td>conf/sigmod/2002</td>\n      <td>vfTH6FzOpDAJ</td>\n      <td>proceedings of the 2002 acm sigmod international conference on management of data, madison, wisc...</td>\n      <td>NaN</td>\n      <td>2002</td>\n      <td>data modeling of time-based media. in: proceedings of the 1994 acm sigmod international conferen...</td>\n      <td>s gibbs, c breiteneder, d tsichritzis</td>\n      <td>1994</td>\n    </tr>\n    <tr>\n      <th>81</th>\n      <td>81</td>\n      <td>conf/sigmod/2003</td>\n      <td>url:http://portal.acm.org/citation.cfm%3Fid%3D50202%26dl%3DGUIDE%26dl%3DGUIDE%26type%3Dproceedin...</td>\n      <td>proceedings of the 2003 acm sigmod international conference on management of data, san diego, ca...</td>\n      <td>NaN</td>\n      <td>2003</td>\n      <td>source international conference on management of data archive</td>\n      <td>h boral, pa larson</td>\n      <td>1988</td>\n    </tr>\n    <tr>\n      <th>93</th>\n      <td>93</td>\n      <td>conf/sigmod/AbadiCCCCEGHMRSSTXYZ03</td>\n      <td>eBnT7lhV2LwJ</td>\n      <td>aurora: a data stream management system</td>\n      <td>d abadi, d carney, u ï¿½etintemel, m cherniack, c convey, c erwin, e galvez, m hatoun, a maskey,...</td>\n      <td>2003</td>\n      <td>aurora: a data stream management system (demo description)</td>\n      <td>d abadi, d carney, u 
cetintemel, m cherniack, c</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {},
     "execution_count": 19
    }
   ],
   "source": [
    "S.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "1000\n"
     ]
    }
   ],
   "source": [
    "# Label every candidate pair in S: '1' when its (ltable_id, rtable_id) appears\n",
    "# in the first two columns of dblp_scholar_map (the perfect mapping), else '0'.\n",
    "#\n",
    "# A set lookup replaces the former nested scan. That scan used `continue`\n",
    "# where `break` was intended, so a pair listed more than once in the mapping\n",
    "# received several labels and `labels` ended up longer than S.\n",
    "gold_pairs = set(\n",
    "    # dropna: a missing id must never count as a match.\n",
    "    dblp_scholar_map.iloc[:, :2].dropna().itertuples(index=False, name=None)\n",
    ")\n",
    "labels = [\n",
    "    '1' if pair in gold_pairs else '0'\n",
    "    for pair in zip(S['ltable_id'], S['rtable_id'])\n",
    "]\n",
    "print(len(labels))\n",
    "# Assign in place rather than DataFrame.insert(loc=9, ...): insert raised\n",
    "# ValueError when this cell was re-run because 'label' already existed.\n",
    "# S must stay the same object (no rebinding to a copy) -- py_entitymatching\n",
    "# is understood to track table metadata per DataFrame object; TODO confirm.\n",
    "S['label'] = labels\n",
    "S.to_csv(s_file_name, sep=\",\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Split samples into I and J\n500\n500\n"
     ]
    }
   ],
   "source": [
    "# Split the labeled sample S into two halves, I and J, caching both on disk.\n",
    "i_file = \"datasets/DBLP-Scholar/I.csv\"\n",
    "j_file = \"datasets/DBLP-Scholar/J.csv\"\n",
    "\n",
    "# Use the cache only when BOTH halves exist; checking I.csv alone crashed the\n",
    "# read branch whenever J.csv was missing.\n",
    "if os.path.isfile(i_file) and os.path.isfile(j_file):\n",
    "    # Reload each half and re-register its key / foreign-key metadata.\n",
    "    I, J = (\n",
    "        em.read_csv_metadata(path, key=\"_id\",\n",
    "                             ltable=dblp, rtable=scholar,\n",
    "                             fk_ltable=\"ltable_id\", fk_rtable=\"rtable_id\")\n",
    "        for path in (i_file, j_file)\n",
    "    )\n",
    "    print(\"Reading I and J from files\")\n",
    "else:\n",
    "    # 50/50 split with a fixed random_state so the split is reproducible.\n",
    "    IJ = em.split_train_test(S, train_proportion=0.5, random_state=0)\n",
    "    I = IJ['train']\n",
    "    J = IJ['test']\n",
    "    # index=False: a written index is read back as a spurious 'Unnamed: 0'\n",
    "    # column, making the cached frames differ from the freshly split ones.\n",
    "    I.to_csv(i_file, sep=\",\", index=False)\n",
    "    J.to_csv(j_file, sep=\",\", index=False)\n",
    "    print(\"Split samples into I and J\")\n",
    "print(len(I))\n",
    "print(len(J))"
   ]
  },
  {
   "source": [
    "参考文献:\n",
    "\n",
    "[1] Python-使用Magellan进行数据匹配总结[OL]. <https://blog.csdn.net/weixin_34124939/article/details/86357847>. 2020-11-12.\n"
   ],
   "cell_type": "markdown",
   "metadata": {}
  }
 ]
}